library(readr)
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(d3heatmap)
library(ggplot2)
C <- read_csv("Group9/brazilian-ecommerce/olist_customers_dataset.csv")## Parsed with column specification:
## cols(
## customer_id = col_character(),
## customer_unique_id = col_character(),
## customer_zip_code_prefix = col_character(),
## customer_city = col_character(),
## customer_state = col_character()
## )
Lo <- read_csv("Group9/brazilian-ecommerce/olist_geolocation_dataset.csv")## Parsed with column specification:
## cols(
## geolocation_zip_code_prefix = col_character(),
## geolocation_lat = col_double(),
## geolocation_lng = col_double(),
## geolocation_city = col_character(),
## geolocation_state = col_character()
## )
It <- read_csv("Group9/brazilian-ecommerce/olist_order_items_dataset.csv")## Parsed with column specification:
## cols(
## order_id = col_character(),
## order_item_id = col_integer(),
## product_id = col_character(),
## seller_id = col_character(),
## shipping_limit_date = col_datetime(format = ""),
## price = col_double(),
## freight_value = col_double()
## )
pay <- read_csv("Group9/brazilian-ecommerce/olist_order_payments_dataset.csv")## Parsed with column specification:
## cols(
## order_id = col_character(),
## payment_sequential = col_integer(),
## payment_type = col_character(),
## payment_installments = col_integer(),
## payment_value = col_double()
## )
re <- read_csv("Group9/brazilian-ecommerce/olist_order_reviews_dataset.csv")## Parsed with column specification:
## cols(
## review_id = col_character(),
## order_id = col_character(),
## review_score = col_integer(),
## review_comment_title = col_character(),
## review_comment_message = col_character(),
## review_creation_date = col_datetime(format = ""),
## review_answer_timestamp = col_datetime(format = "")
## )
order <- read_csv("Group9/brazilian-ecommerce/olist_orders_dataset.csv")## Parsed with column specification:
## cols(
## order_id = col_character(),
## customer_id = col_character(),
## order_status = col_character(),
## order_purchase_timestamp = col_datetime(format = ""),
## order_approved_at = col_datetime(format = ""),
## order_delivered_carrier_date = col_datetime(format = ""),
## order_delivered_customer_date = col_datetime(format = ""),
## order_estimated_delivery_date = col_datetime(format = "")
## )
prod <- read_csv("Group9/brazilian-ecommerce/olist_products_dataset.csv")## Parsed with column specification:
## cols(
## product_id = col_character(),
## product_category_name = col_character(),
## product_name_lenght = col_integer(),
## product_description_lenght = col_integer(),
## product_photos_qty = col_integer(),
## product_weight_g = col_integer(),
## product_length_cm = col_integer(),
## product_height_cm = col_integer(),
## product_width_cm = col_integer()
## )
sell <- read_csv("Group9/brazilian-ecommerce/olist_sellers_dataset.csv")## Parsed with column specification:
## cols(
## seller_id = col_character(),
## seller_zip_code_prefix = col_character(),
## seller_city = col_character(),
## seller_state = col_character()
## )
categ <- read_csv("Group9/brazilian-ecommerce/product_category_name_translation.csv")## Parsed with column specification:
## cols(
## product_category_name = col_character(),
## product_category_name_english = col_character()
## )
unique(order$order_id) %>% length() ### 99441 different order## [1] 99441
unique(order$customer_id) %>% length() ### 99441 customer ID## [1] 99441
duplicated(order$customer_id) %>% table() ### false 99441## .
## FALSE
## 99441
order$order_id %in% re$order_id %>% table() ### TRUE## .
## TRUE
## 99441
###
re$review_id %>% duplicated() %>% table() ### TRUE 827, FALSE 99173## .
## FALSE TRUE
## 99173 827
re$order_id %>% duplicated() %>% table() ### TRUE 559, FALSE 99441## .
## FALSE TRUE
## 99441 559
unique(re$order_id) %>% length() ### 99441## [1] 99441
setdiff(unique(re$order_id),unique(order$order_id)) ### character(0)## character(0)
duplicated(C[,c(1,2)]) %>% table()## .
## FALSE
## 99441
###
order$customer_id %>% unique() %>% length() ### 99411 ## [1] 99441
C$customer_unique_id %>% unique() %>% length() ### 96096## [1] 96096
C$customer_unique_id %>% table %>% table()## .
## 1 2 3 4 5 6 7 9 17
## 93099 2745 203 30 8 6 3 1 1
# 1 2 3 4 5 6 7 9 17
# 93099 2745 203 30 8 6 3 1 1
C[,c(4, 5)] <- lapply(C[,c(4, 5)], factor)
summary(C)## customer_id customer_unique_id customer_zip_code_prefix
## Length:99441 Length:99441 Length:99441
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## customer_city customer_state
## sao paulo :15540 SP :41746
## rio de janeiro: 6882 RJ :12852
## belo horizonte: 2773 MG :11635
## brasilia : 2131 RS : 5466
## curitiba : 1521 PR : 5045
## campinas : 1444 SC : 3637
## (Other) :69150 (Other):19060
C$customer_city %>% table() %>% sort %>% tail(10)## .
## sao bernardo do campo guarulhos salvador
## 938 1189 1245
## porto alegre campinas curitiba
## 1379 1444 1521
## brasilia belo horizonte rio de janeiro
## 2131 2773 6882
## sao paulo
## 15540
# sao bernardo do campo guarulhos salvador porto alegre campinas
# 938 1189 1245 1379 1444
# curitiba brasilia belo horizonte rio de janeiro sao paulo
# 1521 2131 2773 6882 15540
###
setdiff(unique(pay$order_id),unique(order$order_id)) ### character(0)## character(0)
setdiff(unique(order$order_id),unique(pay$order_id)) ### "bfbd0f9bdef84302105ad712db648a6c"## [1] "bfbd0f9bdef84302105ad712db648a6c"
pay$payment_type <- as.factor(pay$payment_type)
unique(pay$order_id) %>% length() ### 99440## [1] 99440
pay$order_id %>% table() %>% table() ## .
## 1 2 3 4 5 6 7 8 9 10 11 12
## 96479 2382 301 108 52 36 28 11 9 5 8 8
## 13 14 15 19 21 22 26 29
## 3 2 2 2 1 1 1 1
pay$payment_sequential %>% table() %>% sum() ### 103886## [1] 103886
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 19 21 22 26 29
# 96479 2382 301 108 52 36 28 11 9 5 8 8 3 2 2 2 1 1 1 1
pay[pay$payment_sequential == 2,]$order_id %in% pay[pay$payment_sequential == 1,]$order_id %>% table()## .
## FALSE TRUE
## 80 2959
# FALSE TRUE
# 80 2959
summary(pay)## order_id payment_sequential payment_type
## Length:103886 Min. : 1.000 boleto :19784
## Class :character 1st Qu.: 1.000 credit_card:76795
## Mode :character Median : 1.000 debit_card : 1529
## Mean : 1.093 not_defined: 3
## 3rd Qu.: 1.000 voucher : 5775
## Max. :29.000
## payment_installments payment_value
## Min. : 0.000 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 56.79
## Median : 1.000 Median : 100.00
## Mean : 2.853 Mean : 154.10
## 3rd Qu.: 4.000 3rd Qu.: 171.84
## Max. :24.000 Max. :13664.08
# order_id payment_sequential payment_type payment_installments payment_value
# Length:103886 Min. : 1.00 boleto :19784 Min. : 0.00 Min. : 0
# Class :character 1st Qu.: 1.00 credit_card:76795 1st Qu.: 1.00 1st Qu.: 57
# Mode :character Median : 1.00 debit_card : 1529 Median : 1.00 Median : 100
# Mean : 1.09 not_defined: 3 Mean : 2.85 Mean : 154
# 3rd Qu.: 1.00 voucher : 5775 3rd Qu.: 4.00 3rd Qu.: 172
# Max. :29.00 Max. :24.00 Max. :13664
#
### It
It$order_id %>% unique() %>% length() ### 98666 ## [1] 98666
It$order_id %in% order$order_id %>% table() ### TRUE## .
## TRUE
## 112650
order$order_id %in% It$order_id %>% table() ### TRUE 98666 , FALSE 775## .
## FALSE TRUE
## 775 98666
It$order_id %>% table() %>% table() ## .
## 1 2 3 4 5 6 7 8 9 10 11 12
## 88863 7516 1322 505 204 198 22 8 3 8 4 5
## 13 14 15 20 21
## 1 2 2 2 1
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 20 21
# 88863 7516 1322 505 204 198 22 8 3 8 4 5 1 2 2 2 1
It[It$order_item_id == 2,]$order_id %in% It[It$order_item_id == 1,]$order_id %>% table() ### TRUE## .
## TRUE
## 9803
# TRUE
# 9803
It$product_id %>% unique() %>% length() ### 32951## [1] 32951
It$seller_id %>% unique() %>% length() ### 3095## [1] 3095
###
prod$product_id %>% unique() %>% length() ### 32951## [1] 32951
setdiff(unique(It$product_id), unique(prod$product_id)) ### character(0)## character(0)
prod$product_category_name %>% unique() %>% length() ### 74## [1] 74
###
categ$product_category_name %>% unique() %>% length() ### 71## [1] 71
setdiff(unique(prod$product_category_name), unique(categ$product_category_name))## [1] NA
## [2] "pc_gamer"
## [3] "portateis_cozinha_e_preparadores_de_alimentos"
### NA , "pc_gamer" , "portateis_cozinha_e_preparadores_de_alimentos"
###
Z = It %>% group_by(seller_id) %>% summarise(
ItemsSold = n(),
Rev = sum(price),
noProd = n_distinct(product_id),
avgPrice = mean(price),
maxPrice = max(price),
minPrice = min(price),
avgFreight = mean(freight_value),
avgRevProd = Rev/noProd,
avgItemsProd = ItemsSold/noProd
)
summary(Z)## seller_id ItemsSold Rev noProd
## Length:3095 Min. : 1.0 Min. : 3.5 Min. : 1.00
## Class :character 1st Qu.: 2.0 1st Qu.: 208.8 1st Qu.: 2.00
## Mode :character Median : 8.0 Median : 821.5 Median : 4.00
## Mean : 36.4 Mean : 4391.5 Mean : 11.13
## 3rd Qu.: 24.0 3rd Qu.: 3280.8 3rd Qu.: 10.00
## Max. :2033.0 Max. :229472.6 Max. :399.00
## avgPrice maxPrice minPrice avgFreight
## Min. : 3.50 Min. : 3.5 Min. : 0.85 Min. : 1.20
## 1st Qu.: 52.18 1st Qu.: 81.0 1st Qu.: 21.64 1st Qu.: 14.74
## Median : 95.47 Median : 169.9 Median : 44.80 Median : 18.23
## Mean : 176.32 Mean : 335.4 Mean : 110.39 Mean : 23.38
## 3rd Qu.: 173.99 3rd Qu.: 349.9 3rd Qu.: 99.00 3rd Qu.: 24.37
## Max. :6729.00 Max. :6735.0 Max. :6729.00 Max. :308.34
## avgRevProd avgItemsProd
## Min. : 3.5 Min. : 1.000
## 1st Qu.: 89.0 1st Qu.: 1.000
## Median : 179.8 Median : 1.667
## Mean : 417.5 Mean : 2.669
## 3rd Qu.: 397.7 3rd Qu.: 2.726
## Max. :16983.5 Max. :128.333
###
X = unique(It[,c(1,4)]) %>% left_join(re[,2:3]) %>%
group_by(seller_id) %>% summarise(
noReview = n(),
avgScore = mean(review_score),
minScore = min(review_score),
maxScore = max(review_score)) ## Joining, by = "order_id"
###
Z = Z %>% left_join(X) %>% left_join(sell)## Joining, by = "seller_id"
## Joining, by = "seller_id"
pacman::p_load(dplyr, FactoMineR, factoextra)ZZ <- Z[,c(1,2,3,4,5,8,9,10,11,12,15,16,17)]library(readr)
library(dplyr)
library(d3heatmap)
library(ggplot2)
library(caTools)
## library(chorddiag)
set.seed(1234)
ZZ$grp = kmeans(scale(ZZ[,c(3,5,6,9,10)]),5)$cluster %>% factor
ZZ$grp %>% table()## .
## 1 2 3 4 5
## 23 1123 1526 138 285
# Z$group = factor(cutree(hc, k=4))
#
# fviz_dend(
# hc, k=8, show_labels=F, rect=T, rect_fill=T,
# labels_track_height=0,
# palette="ucscgb", rect_border="ucscgb")pacman::p_load(dplyr, FactoMineR, factoextra)
pca = PCA(ZZ[,c(3,5,6,9,10)])fviz_pca_ind(pca)grp <- ZZ$grp
fviz_pca_biplot(
pca, repel=T, col.var="black",
col.ind=grp, alpha.ind=0.6, pointshape=16)# grp <- ZZ$grp
#
# fviz_pca_biplot(
# pca, repel=T, col.var="black",
# col.ind=grp, alpha.ind=0.6, pointshape=16,
# addEllipses = TRUE, ellipse.level = 0.6, mean.point = FALSE)group_by(ZZ, grp) %>% summarise(
REV=mean(Rev),
avgPrice=mean(avgPrice),
avgFreight=mean(avgFreight),
noReview=mean(noReview),
avgScore=mean(avgScore),
size=n() )## # A tibble: 5 x 7
## grp REV avgPrice avgFreight noReview avgScore size
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 1 128544. 162. 20.9 973. 4.07 23
## 2 2 6988. 152. 22.3 54.5 3.78 1123
## 3 3 995. 111. 19.1 9.78 4.58 1526
## 4 4 8221. 1148. 83.2 8.67 4.03 138
## 5 5 475. 152. 22.1 2.94 1.59 285
### 價格高的顧客願意等???ZZZ<- ZZ[,c(3,5,6,9,10)]
colMeans(ZZZ)## Rev avgPrice avgFreight noReview avgScore
## 4391.484233 176.325102 23.380116 32.497577 3.982833
ZA<- scale(ZZZ) %>% data.frame
par(cex=0.8)
split(ZA, grp) %>% sapply(colMeans) %>% barplot(beside=T,col=rainbow(5))
legend('topright',legend=colnames(ZZZ),fill=rainbow(5))load("~/Downloads/II.RData")II2 <- II %>% filter(grp== 1|grp == 4| grp== 2)
II2$grp <- II2$grp %>% as.numeric() %>% as.factor()
table(II2$grp, II2$category) %>% as.data.frame.matrix %>%
d3heatmap(F,F,col=cm.colors(13)[3:13])table(II$category)##
## agro_industry_and_commerce
## 14
## air_conditioning
## 19
## art
## 12
## arts_and_craftmanship
## 6
## audio
## 5
## auto
## 227
## baby
## 92
## bed_bath_table
## 124
## books_general_interest
## 37
## books_imported
## 2
## books_technical
## 18
## christmas_supplies
## 7
## cine_photo
## 2
## computers
## 6
## computers_accessories
## 158
## consoles_games
## 24
## construction_tools_construction
## 59
## construction_tools_lights
## 17
## construction_tools_safety
## 15
## cool_stuff
## 83
## costruction_tools_garden
## 9
## costruction_tools_tools
## 9
## diapers_and_hygiene
## 5
## drinks
## 20
## dvds_blu_ray
## 3
## electronics
## 38
## fashio_female_clothing
## 3
## fashion_bags_accessories
## 58
## fashion_male_clothing
## 4
## fashion_shoes
## 4
## fashion_sport
## 3
## fashion_underwear_beach
## 2
## fixed_telephony
## 13
## flowers
## 1
## food
## 12
## food_drink
## 16
## furniture_bedroom
## 12
## furniture_decor
## 208
## furniture_living_room
## 25
## furniture_mattress_and_upholstery
## 1
## garden_tools
## 102
## health_beauty
## 282
## home_appliances
## 20
## home_appliances_2
## 17
## home_confort
## 3
## home_construction
## 22
## housewares
## 230
## industry_commerce_and_business
## 9
## kitchen_dining_laundry_garden_furniture
## 22
## la_cuisine
## 1
## luggage_accessories
## 33
## market_place
## 8
## music
## 4
## musical_instruments
## 38
## office_furniture
## 23
## party_supplies
## 5
## perfumery
## 92
## pet_shop
## 84
## signaling_and_security
## 14
## small_appliances
## 41
## small_appliances_home_oven_and_coffee
## 5
## sports_leisure
## 272
## stationery
## 60
## tablets_printing_image
## 2
## telephony
## 55
## toys
## 118
## watches_gifts
## 52
table(II$grp, II$category) %>% as.data.frame.matrix %>%
d3heatmap(F,F,col=cm.colors(13)[3:13])## II %>% group_by(grp)
II4 <- II %>% filter(grp == 3)
II4$category %>% table %>% sort(decreasing = TRUE) ## .
## sports_leisure
## 94
## auto
## 90
## furniture_decor
## 89
## housewares
## 81
## health_beauty
## 78
## computers_accessories
## 60
## bed_bath_table
## 44
## toys
## 33
## garden_tools
## 32
## baby
## 29
## perfumery
## 28
## telephony
## 26
## cool_stuff
## 23
## pet_shop
## 23
## fashion_bags_accessories
## 21
## electronics
## 20
## stationery
## 19
## construction_tools_construction
## 15
## small_appliances
## 15
## watches_gifts
## 14
## furniture_living_room
## 11
## books_general_interest
## 10
## home_construction
## 10
## consoles_games
## 9
## fixed_telephony
## 9
## home_appliances
## 9
## home_appliances_2
## 9
## musical_instruments
## 9
## construction_tools_lights
## 8
## kitchen_dining_laundry_garden_furniture
## 8
## office_furniture
## 7
## air_conditioning
## 6
## food_drink
## 6
## signaling_and_security
## 6
## construction_tools_safety
## 5
## drinks
## 5
## luggage_accessories
## 5
## costruction_tools_garden
## 4
## market_place
## 4
## agro_industry_and_commerce
## 3
## furniture_bedroom
## 3
## art
## 2
## audio
## 2
## books_technical
## 2
## christmas_supplies
## 2
## costruction_tools_tools
## 2
## diapers_and_hygiene
## 2
## fashio_female_clothing
## 2
## fashion_male_clothing
## 2
## fashion_sport
## 2
## fashion_underwear_beach
## 2
## food
## 1
## home_confort
## 1
## la_cuisine
## 1
## party_supplies
## 1
## small_appliances_home_oven_and_coffee
## 1
# auto garden_tools health_beauty sports_leisure furniture_decor
# 13 12 10 8 7
# baby computers_accessories home_appliances_2 pet_shop cool_stuff
# 6 6 6 6 5 R <- re
R2 <- R[,c(2, 4, 5)]
R2 <- left_join(R2, I[,c(1,4)])## Joining, by = "order_id"
R2 <- left_join(R2[,c(4,1,2,3)], ZZ[,c(1,14)])## Joining, by = "seller_id"
R21 <- R2 %>% filter(grp == 1)library(rvest)## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
library(tm)## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(SnowballC)
library(wordcloud)## Loading required package: RColorBrewer
library(RColorBrewer)
docs <- Corpus(VectorSource(R21$review_comment_message))
# 將內容以語料庫形式儲存docs <- tm_map(docs, content_transformer(tolower)) ## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
docs <- tm_map(docs, removeNumbers) #移除數字## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
docs <- tm_map(docs, removeWords,stopwords("portuguese")) # 移除常見的轉折詞彙## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("portuguese")):
## transformation drops documents
docs <- tm_map(docs, removePunctuation) #移除標點符號## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation
## drops documents
docs <- tm_map(docs, stripWhitespace) # 移除額外的空白## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d<- data.frame(word=names(v), freq=v)
head(d,50)## word freq
## produto produto 4698
## prazo prazo 2209
## entrega entrega 1762
## recebi recebi 1709
## chegou chegou 1596
## antes antes 1567
## bom bom 1203
## recomendo recomendo 1098
## veio veio 1078
## entregue entregue 1075
## comprei comprei 1050
## qualidade qualidade 881
## bem bem 854
## ainda ainda 672
## loja loja 671
## compra compra 610
## gostei gostei 591
## tudo tudo 568
## produtos produtos 493
## super super 486
## ótimo ótimo 475
## apenas apenas 464
## pedido pedido 461
## excelente excelente 456
## boa boa 424
## dia dia 398
## dentro dentro 384
## rápida rápida 384
## relógio relógio 353
## nao nao 339
## lannister lannister 337
## site site 313
## dois dois 310
## porém porém 310
## pois pois 305
## agora agora 290
## sempre sempre 285
## ser ser 280
## conforme conforme 278
## pra pra 278
## lindo lindo 272
## correios correios 268
## nota nota 259
## duas duas 258
## ótima ótima 254
## rápido rápido 249
## perfeito perfeito 245
## parabéns parabéns 234
## dias dias 233
## adorei adorei 232
wordcloud(words=d$word,freq=d$freq,min.freq=2,
max.words=150,random.order=FALSE,rot.per=0.35,
colors=brewer.pal(35,"Dark2"))## Warning in brewer.pal(35, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
Lo$geolocation_lat <- round(Lo$geolocation_lat,3)
Lo$geolocation_lng <- round(Lo$geolocation_lng,3)unique(sell$seller_zip_code_prefix) %>% length() ### 2246## [1] 2246
names(Lo)[1] <- "seller_zip_code_prefix"
sell$seller_zip_code_prefix %>% unique() %>% length() ## [1] 2246
sell_lo <- left_join(sell, Lo, by = "seller_zip_code_prefix")sell_lo2 <- sell_lo %>% group_by(seller_id, seller_zip_code_prefix, geolocation_state) %>% summarise(lat = mean(geolocation_lat), lng = mean(geolocation_lng))
sell_lo2 <- sell_lo2[!duplicated(sell_lo2$seller_id),]
sell_lo2 <- left_join(sell_lo2, ZZ[,c(1, 5, 10, 14)])## Joining, by = "seller_id"
table(sell_lo2$grp)##
## 1 2 3 4 5
## 23 1123 1526 138 285
library(lubridate)##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(Imap)
library(maps)
Brazil <- map_data("world")%>%filter(region=="Brazil")ggplot() +
geom_polygon(data = Brazil, aes(x= long, y= lat, group= group), fill="black")+
geom_point(data= sell_lo2 , aes(x= lng, y= lat, color= grp), size = 2, alpha = 0.2)## Warning: Removed 7 rows containing missing values (geom_point).
sell_lo22 <- sell_lo2 %>% filter(grp== 5)
ggplot() +
geom_polygon(data = Brazil, aes(x= long, y= lat, group= group), fill="black") + geom_point(data= sell_lo22,aes(x= lng, y= lat, color= grp), size = 2, alpha = 0.2)## Warning: Removed 1 rows containing missing values (geom_point).
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.